#!/usr/bin/python

# FIXME
# -- add more detailed preprocessing options, separate out stages explicitly
# -- page frame removal, noise removal, ...
# -- parallelize TIFF images

import sys,os,re,optparse,shutil,traceback
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from multiprocessing import Pool
import signal,fcntl
signal.signal(signal.SIGINT,lambda *args:sys.exit(1))
from pylab import *
from scipy.ndimage import measurements
from scipy.misc import imsave
from PIL import Image
import ocrolib
from ocrolib import number_of_processors
import ocropreproc
from ocrolib import ocroold

# Command-line interface.  The usage text is what --help prints (and what is
# shown when the script is run without any image arguments).
parser = optparse.OptionParser(usage="""
%prog -o dir [options] image1 image2 ...

The following applies to "-B ocroold.Standardpreprocessing"

Performs preprocessing on each of the images on the command line and stores
the resulting output in a directory with "book structure".  That is, the
input pages will be stored in dir/0001.png dir/0001.bin.png dir/0002.png
dir/0002.bin.png etc.  dir/0001.png contains the deskewed grayscale image,
while dir/0001.bin.png contains the binarized version.

Preprocessing uses hysteresis thresholding; you control it mainly through the
-L and -H arguments, which take values between 0 and 1:

* large parts of characters are missing: decrease -H
* there is too much noise in the image: increase -H
* characters are too thin or broken up: increase -L
* characters are too thick or touching: decrease -L
""")

# -- where the results go
parser.add_option(
    "-o", "--output",
    default="book",
    help="output directory")
parser.add_option(
    "-O", "--Output",
    action="store_true",
    help="output image.png to image.bin.png (in place)")

# -- interactive display of intermediate results
parser.add_option(
    "-d", "--display",
    action="store_true",
    help="display result")
parser.add_option(
    "-D", "--Display",
    action="store_true",
    help="display continuously")

# -- thresholding parameters (a positive -T overrides both -L and -H)
parser.add_option(
    "-T", "--threshold",
    type=float,
    default=-1,
    help="threshold (simple Sauvola if set)")
parser.add_option(
    "-L", "--low",
    type=float,
    default=0.1,
    help="low threshold (0.1)")
parser.add_option(
    "-H", "--high",
    type=float,
    default=0.4,
    help="high threshold (0.4)")
parser.add_option(
    "-W", "--width",
    type=float,
    default=40,
    help="width parameter (40)")

# -- page properties, diagnostics and execution control
parser.add_option(
    "-r", "--dpi",
    type=float,
    default=300,
    help="resolution (DPI) (300)")
parser.add_option(
    "-q", "--silent",
    action="store_true",
    help="disable warnings")
parser.add_option(
    "-P", "--parallel",
    type=int,
    default=number_of_processors(),
    help="number of parallel processes to use")
parser.add_option(
    "-B", "--binarizer",
    default="ocropreproc.CommonPreprocessing",
    help="binarization component to be used")
parser.add_option(
    "-g", "--gtextension",
    default=None,
    help="ground truth extension for copying in ground truth (include all dots)")

# options: parsed flag values; args: the remaining positional image paths.
options, args = parser.parse_args()

# Without any input images there is nothing to do: show the help and stop.
if not args:
    parser.print_help()
    sys.exit(0)

# A positive -T replaces hysteresis thresholding by a single threshold:
# both the low and the high threshold are set to the same value.
if options.threshold>0:
    options.low = options.threshold
    options.high = options.threshold

# Build the binarizer.  Only the "StandardPreprocessing" component is
# configured from -L/-H/-W here; any other component name is handed to
# ocrolib.make_IBinarize unchanged.
if options.binarizer=="StandardPreprocessing":
    binarizer = ocrolib.StandardPreprocessing()
    binarizer.pset("binarizer","BinarizeByHT")
    binarizer.reinit()
    binarizer.command("binarizer_pset","k0",str(options.low))
    binarizer.command("binarizer_pset","k1",str(options.high))
    binarizer.command("binarizer_pset","width",str(options.width))
else:
    binarizer = ocrolib.make_IBinarize(options.binarizer)

# -D (continuous display) implies -d; switch pylab to interactive mode.
if options.Display: options.display = 1
if options.display: ion(); show()

# `files` is the open book index (dir/FILES); it stays None in in-place mode.
files = None
if options.Output:
    # In-place mode writes next to each input, so no output directory is used.
    options.output = None
else:
    if os.path.exists(options.output):
        print("%s: already exists; please remove"%options.output)
        # Refusing to overwrite an existing book is an error, so report it
        # through a non-zero exit status (this used to exit with 0).
        sys.exit(1)
    os.mkdir(options.output)
    files = open(os.path.join(options.output,"FILES"),"w")

def process_image(image,arg,count):
    """Binarize one page image and write the grayscale and binary results.

    Args:
        image: the page image to binarize (as yielded by
            ocrolib.page_iterator in the callers).
        arg: path of the file the page came from.  It is used for messages,
            for the FILES index, to derive the output name in in-place mode
            (-O), and to locate the ground truth file when -g is given.
        count: page number; in book mode the outputs are named
            "<output>/<count as 4 digits>.png" and ".bin.png".  Unused in
            in-place mode.

    Returns:
        0 when the page is skipped (the binarizer raised, or the page failed
        ocrolib.quick_check_page_components); None after a successful write.
    """
    if options.display:
        clf()
        imshow(image,cmap=cm.gray)
        draw()
        ginput(1,timeout=1)

    # The binarize call has to be inside the try block; it previously sat
    # outside of it, which left the "skipping" handler below unreachable.
    try:
        binary,gray = binarizer.binarize(image)
    except Exception:
        # Show what went wrong instead of swallowing it, then skip the page.
        traceback.print_exc()
        print("# binarizer failed; skipping %s" % (arg,))
        return 0

    if options.display:
        clf()
        imshow(binary,cmap=cm.gray)
        draw()
        if not options.Display:
            raw_input("hit ENTER to continue")
        else:
            ginput(1,timeout=1)

    # The page sanity check only runs when -q/--silent is not given.
    if not options.silent and \
       ocrolib.quick_check_page_components(binary,dpi=options.dpi)<0.5:
        print("# skipping page")
        return 0

    if options.Output:
        # In-place mode: write next to the input, reusing its base name.
        dest,_ = ocrolib.allsplitext(arg)
        print("# writing %s" % (dest,))
    else:
        dest = "%s/%04d" % (options.output,count)
        print("# writing %s %s %s" % (dest,gray.shape,binary.shape))
    imsave(dest+".png",gray)
    imsave(dest+".bin.png",binary)
    if options.Output:
        return

    # Book mode only: copy the ground truth and record the page in FILES.
    if options.gtextension is not None:
        base,_ = ocrolib.allsplitext(arg)
        shutil.copyfile(base+options.gtextension,dest+".gt.txt")
    if files is not None:
        # Take an exclusive lock around each record written to FILES (this
        # function is also run from Pool workers) and release it even if
        # the write fails.
        fcntl.flock(files,fcntl.LOCK_EX)
        try:
            files.write("%04d\t%s\n"%(count,arg))
            files.flush()
        finally:
            fcntl.flock(files,fcntl.LOCK_UN)

def process1(t):
    """Pool worker: preprocess the single-page image file described by t.

    Args:
        t: an (arg, count) tuple, where arg is the path of the image file and
            count is the page number passed on to process_image.

    Raises:
        AssertionError: if the file yields more than one page.  All pages of
            one file share the same count, so a second page would overwrite
            the output of the first.
    """
    arg,count = t
    for n,(image,path) in enumerate(ocrolib.page_iterator([arg])):
        # Reject the second page already (the previous check `n<2` let it
        # through).  Raised explicitly so the check also runs under -O.
        if n>=1:
            raise AssertionError("no multipage files with parallel processing; use -P 0")
        print("=== %s %s %s" % (path,count,image.shape))
        try:
            process_image(image,path,count)
        except Exception:
            # Print the traceback here in the worker, then let the
            # exception propagate to the caller.
            traceback.print_exc()
            raise

# Run the preprocessing: sequentially for -P 0 or -P 1, otherwise with one
# job per command line argument in a process pool.  Pages are numbered from
# 1 in both modes.
try:
    if options.parallel<2:
        # page_iterator may yield several pages per file; each page gets
        # its own number.  The number logged is the one used for the output
        # files (the log line used to show a 0-based count).
        for count,(image,arg) in enumerate(ocrolib.page_iterator(args),1):
            print("=== %s %s %s" % (arg,count,image.shape))
            process_image(image,arg,count)
    else:
        jobs = [(arg,number) for number,arg in enumerate(args,1)]
        pool = Pool(processes=options.parallel)
        pool.map(process1,jobs)
        # All jobs have finished once map() returns; shut the workers down.
        pool.close()
        pool.join()
finally:
    # Close the FILES index (book mode only), also when processing failed.
    if files is not None:
        files.close()
